#include <bitset>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <format>
#include <fstream>
#include <future>
#include <iostream>
#include <mutex>
#include <random>
#include <string>
#include <thread>
#include <vector>

#include <pthread.h>
#include <windows.h>
#include <unistd.h>
#include <stdlib.h>

#include "d2sci.h"

// Number of doubles generated in main() and used to size the benchmark data set
// (1024 * 3 * 4 * 500 = 6,144,000, a multiple of 3 so it splits into whole 3-value rows).
const int NUM_F64 = 1024 * 3 * 4 * 500 ;

// Debug helper: prints the bit layout of `value` to std::cout as three
// space-separated binary fields (1 sign bit, 11 exponent bits, 52 mantissa
// bits) followed by the exponent with the 1023 bias removed.
void print_double(double value)
{
    // Copy the object representation into an integer so the bits can be sliced.
    unsigned long long raw;
    memcpy(&raw, &value, sizeof(value));

    const std::bitset<1> sign_bit(raw >> 63);
    const std::bitset<11> exponent_bits((raw >> 52) & 0x7FF);
    const std::bitset<52> mantissa_bits(raw & 0xFFFFFFFFFFFFF);
    const int unbiased_exponent = (int(raw >> 52) & 0x7FF) - 1023;

    std::cout << "binary double : " << sign_bit << ' ' << exponent_bits << ' ' << mantissa_bits << ' '
              << "ieee754 exp=" << unbiased_exponent << '\n';
}

// Benchmark: writes `data` to ./data/fprintf_write.txt with fprintf, three
// "%.16le" values per line.
//
// data - values to format
// len  - number of values in `data`; only whole rows of three are written, so
//        a trailing len % 3 values are skipped instead of reading past the array.
//
// If the output file cannot be opened the error is reported on stderr and
// nothing is written.
void test_fprintf(double *data, int len)
{
    const char *path = "./data/fprintf_write.txt";
    FILE *f = std::fopen(path, "wb");
    if (f == nullptr)
    {
        std::perror(path);
        return;
    }
    // `i + 2 < len` guarantees all three elements of the row exist.
    for (int i = 0; i + 2 < len; i += 3)
    {
        fprintf(f, "%.16le %.16le %.16le\n", data[i], data[i + 1], data[i + 2]);
    }
    fclose(f);
}
// Benchmark: formats `data` as rows of three "%.16le" values with sprintf on
// four producer threads while one consumer thread appends the finished chunks
// to ./data/sprintf_write_multithread.txt.
//
// Work is split into rounds of thread_num * every_cycle values. In each round
// producer `id` formats its every_cycle values into its own slot of one of two
// buffer halves (even rounds use the first half, odd rounds the second), so a
// producer can fill one half while the consumer is still writing the other.
// The consumer drains the slots in producer order, which keeps the file in the
// same order as `data`.
//
// Values left over after the last full round are written directly with fprintf
// once all threads have finished; a trailing partial row (len % 3 values) is
// not written. If the file or the buffer cannot be obtained, the error is
// reported on stderr and the function returns without writing.
void test_sprintf_multithread(double *data, int len)
{
    const char *path = "./data/sprintf_write_multithread.txt";
    FILE *f = std::fopen(path, "wb");
    if (f == nullptr)
    {
        std::perror(path);
        return;
    }

    const int thread_num = 4;                          // number of producer threads
    const int every_cycle = 2000 * 3;                  // values formatted per producer per round (2000 rows)
    const int round_size = every_cycle * thread_num;   // values consumed by one full round
    const int all_cycle = len / round_size;            // number of full rounds
    const int MAX_SIZE = (4 + (24) * 3) * every_cycle; // bytes reserved per producer slot

    char *buffer = (char *)malloc(2 * MAX_SIZE * thread_num * sizeof(char));
    if (buffer == nullptr)
    {
        std::perror("malloc");
        fclose(f);
        return;
    }
    // The two halves of the double buffer; each half holds one slot per producer.
    char *const halves[2] = {buffer, buffer + MAX_SIZE * thread_num};

    // Per-slot state, indexed [round parity][producer id]; entry [*][id] is
    // only accessed while holding producerThreads_mutex[id].
    std::vector<int> chunk_len[2] = {std::vector<int>(thread_num, 0), std::vector<int>(thread_num, 0)};
    std::vector<int> can_write[2] = {std::vector<int>(thread_num, 1), std::vector<int>(thread_num, 1)};
    std::vector<std::mutex> producerThreads_mutex(thread_num);
    std::vector<std::condition_variable> producerThreads_cv(thread_num);

    // All captures are by reference: every thread is joined before this
    // function returns, so the referenced locals outlive the threads.
    auto producer = [&](int id)
    {
        const int rows_per_round = every_cycle / 3;
        for (int cycle = 0; cycle < all_cycle; ++cycle)
        {
            const int parity = cycle % 2;
            const double *src = data + cycle * round_size + id * every_cycle;
            char *out = halves[parity] + id * MAX_SIZE;

            std::unique_lock lk(producerThreads_mutex[id]);
            // Block until the consumer has handed this slot back.
            producerThreads_cv[id].wait(lk, [&] { return can_write[parity][id] != 0; });

            int length = 0;
            for (int row = 0; row < rows_per_round; ++row, src += 3)
            {
                length += sprintf(out + length, "%.16le %.16le %.16le\n", src[0], src[1], src[2]);
            }
            chunk_len[parity][id] = length;
            can_write[parity][id] = 0; // slot is now readable by the consumer
            lk.unlock();
            producerThreads_cv[id].notify_one();
        }
    };
    auto consumer = [&]()
    {
        for (int cycle = 0; cycle < all_cycle; ++cycle)
        {
            const int parity = cycle % 2;
            for (int id = 0; id < thread_num; ++id)
            {
                std::unique_lock lk(producerThreads_mutex[id]);
                // Block until producer `id` has published this round's chunk.
                producerThreads_cv[id].wait(lk, [&] { return can_write[parity][id] == 0; });
                fwrite(halves[parity] + id * MAX_SIZE, sizeof(char), chunk_len[parity][id], f);
                can_write[parity][id] = 1; // hand the slot back to the producer
                lk.unlock();
                producerThreads_cv[id].notify_one();
            }
        }
    };

    std::vector<std::thread> producerThreads;
    producerThreads.reserve(thread_num);
    for (int id = 0; id < thread_num; ++id)
    {
        producerThreads.emplace_back(producer, id);
    }
    std::thread consumer_thread(consumer);
    for (std::thread &worker : producerThreads)
    {
        worker.join();
    }
    consumer_thread.join();

    // Rows that do not fill a complete round are few; write them directly.
    for (int i = all_cycle * round_size; i + 2 < len; i += 3)
    {
        fprintf(f, "%.16le %.16le %.16le\n", data[i], data[i + 1], data[i + 2]);
    }

    free(buffer);
    fclose(f);
}
// Benchmark: writes `data` to ./data/format_write.txt through std::ofstream,
// formatting three values per line with std::format("{:.16e}").
//
// data - values to format
// len  - number of values in `data`; only whole rows of three are written, so
//        a trailing len % 3 values are skipped instead of reading past the array.
//
// If the output file cannot be opened the error is reported on std::cerr and
// nothing is written.
void test_format(double *data, int len)
{
    std::ofstream f("./data/format_write.txt");
    if (!f)
    {
        std::cerr << "cannot open ./data/format_write.txt\n";
        return;
    }
    for (int i = 0; i + 2 < len; i += 3)
    {
        // Each row ends with '\n', matching the other writers in this file.
        f << std::format("{:.16e} {:.16e} {:.16e}\n", data[i], data[i + 1], data[i + 2]);
    }
    f.close();
}
// Benchmark: formats `data` as rows of three "{:.16e}" values with std::format
// on four producer threads while one consumer thread appends the finished
// chunks to ./data/format_write_multithread.txt.
//
// Work is split into rounds of thread_num * every_cycle values. In each round
// producer `id` formats its every_cycle values into its own slot of one of two
// buffer halves (even rounds use the first half, odd rounds the second), so a
// producer can fill one half while the consumer is still writing the other.
// The consumer drains the slots in producer order, which keeps the file in the
// same order as `data`.
//
// Values left over after the last full round are formatted and written
// directly once all threads have finished; a trailing partial row (len % 3
// values) is not written. If the file or the buffer cannot be obtained, the
// error is reported on stderr and the function returns without writing.
void test_format_multithread(double *data, int len)
{
    const char *path = "./data/format_write_multithread.txt";
    FILE *f = std::fopen(path, "wb");
    if (f == nullptr)
    {
        std::perror(path);
        return;
    }

    const int thread_num = 4;                          // number of producer threads
    const int every_cycle = 2048 * 3;                  // values formatted per producer per round (2048 rows)
    const int round_size = every_cycle * thread_num;   // values consumed by one full round
    const int all_cycle = len / round_size;            // number of full rounds
    const int MAX_SIZE = (4 + (24) * 3) * every_cycle; // bytes reserved per producer slot

    char *buffer = (char *)malloc(2 * MAX_SIZE * thread_num * sizeof(char));
    if (buffer == nullptr)
    {
        std::perror("malloc");
        fclose(f);
        return;
    }
    // The two halves of the double buffer; each half holds one slot per producer.
    char *const halves[2] = {buffer, buffer + MAX_SIZE * thread_num};

    // Per-slot state, indexed [round parity][producer id]; entry [*][id] is
    // only accessed while holding producerThreads_mutex[id].
    std::vector<int> chunk_len[2] = {std::vector<int>(thread_num, 0), std::vector<int>(thread_num, 0)};
    std::vector<int> can_write[2] = {std::vector<int>(thread_num, 1), std::vector<int>(thread_num, 1)};
    std::vector<std::mutex> producerThreads_mutex(thread_num);
    std::vector<std::condition_variable> producerThreads_cv(thread_num);

    // All captures are by reference: every thread is joined before this
    // function returns, so the referenced locals outlive the threads.
    auto producer = [&](int id)
    {
        const int rows_per_round = every_cycle / 3;
        for (int cycle = 0; cycle < all_cycle; ++cycle)
        {
            const int parity = cycle % 2;
            const double *src = data + cycle * round_size + id * every_cycle;
            char *out = halves[parity] + id * MAX_SIZE;

            std::unique_lock lk(producerThreads_mutex[id]);
            // Block until the consumer has handed this slot back.
            producerThreads_cv[id].wait(lk, [&] { return can_write[parity][id] != 0; });

            int length = 0;
            for (int row = 0; row < rows_per_round; ++row, src += 3)
            {
                const std::string str_tmp = std::format("{:.16e} {:.16e} {:.16e}\n", src[0], src[1], src[2]);
                memcpy(out + length, str_tmp.data(), str_tmp.size());
                length += static_cast<int>(str_tmp.size());
            }
            chunk_len[parity][id] = length;
            can_write[parity][id] = 0; // slot is now readable by the consumer
            lk.unlock();
            producerThreads_cv[id].notify_one();
        }
    };
    auto consumer = [&]()
    {
        for (int cycle = 0; cycle < all_cycle; ++cycle)
        {
            const int parity = cycle % 2;
            for (int id = 0; id < thread_num; ++id)
            {
                std::unique_lock lk(producerThreads_mutex[id]);
                // Block until producer `id` has published this round's chunk.
                producerThreads_cv[id].wait(lk, [&] { return can_write[parity][id] == 0; });
                fwrite(halves[parity] + id * MAX_SIZE, sizeof(char), chunk_len[parity][id], f);
                can_write[parity][id] = 1; // hand the slot back to the producer
                lk.unlock();
                producerThreads_cv[id].notify_one();
            }
        }
    };

    std::vector<std::thread> producerThreads;
    producerThreads.reserve(thread_num);
    for (int id = 0; id < thread_num; ++id)
    {
        producerThreads.emplace_back(producer, id);
    }
    std::thread consumer_thread(consumer);
    for (std::thread &worker : producerThreads)
    {
        worker.join();
    }
    consumer_thread.join();

    // Rows that do not fill a complete round are few; write them directly.
    for (int i = all_cycle * round_size; i + 2 < len; i += 3)
    {
        const std::string str_tmp = std::format("{:.16e} {:.16e} {:.16e}\n", data[i], data[i + 1], data[i + 2]);
        fwrite(str_tmp.data(), sizeof(char), str_tmp.size(), f);
    }

    free(buffer);
    fclose(f);
}

// Benchmark: writes `data` to K:/data/d2sci_write.txt, three values per line,
// formatting each value with d2sci() into a reusable buffer that is flushed
// with fwrite every K rows.
//
// data - values to format
// len  - number of values in `data`; only whole rows of three are written.
//        The last batch is shortened to the rows that actually remain, so the
//        function no longer reads past `data` when len is not a multiple of 3*K.
//
// If the output file cannot be opened the error is reported on stderr and
// nothing is written.
void test_d2sci(double *data, int len)
{
    const int K = 1000; // rows formatted between two fwrite calls
    // 25 bytes per value: assumes d2sci() emits at most 24 characters, plus
    // one separator -- TODO confirm against d2sci.h.
    std::vector<char> vbuffer(25 * 3 * K);

    const char *path = "K:/data/d2sci_write.txt";
    FILE *f = std::fopen(path, "wb");
    if (f == nullptr)
    {
        std::perror(path);
        return;
    }

    const int total_rows = len / 3;
    for (int first_row = 0; first_row < total_rows; first_row += K)
    {
        const int remaining = total_rows - first_row;
        const int batch_rows = remaining < K ? remaining : K;
        const double *src = data + first_row * 3;

        int length = 0;
        for (int j = 0; j < batch_rows; ++j, src += 3)
        {
            length += d2sci(src[0], &vbuffer[length]);
            vbuffer[length++] = ' ';

            length += d2sci(src[1], &vbuffer[length]);
            vbuffer[length++] = ' ';

            length += d2sci(src[2], &vbuffer[length]);
            vbuffer[length++] = '\n';
        }

        fwrite(&vbuffer[0], sizeof(char), length, f);
    }
    fclose(f);
}

// Benchmark: formats `data` as rows of three values with d2sci() on six
// producer threads while one consumer thread appends the finished chunks to
// K:/data/d2sci_write_multithread.txt.
//
// Work is split into rounds of thread_num * every_cycle values. In each round
// producer `id` formats its share into its own slot of one of two buffer
// halves (even rounds use the first half, odd rounds the second), so a
// producer can fill one half while the consumer is still writing the other.
// The consumer drains the slots in producer order, which keeps the file in the
// same order as `data`.
//
// If rows remain after the last full round, one extra "tail" round distributes
// them over the producers: every producer gets last_cycle_row / thread_num
// rows and the first last_cycle_row % thread_num producers get one more.
// Every producer takes part in the tail round, publishing an empty chunk if it
// has no rows, because the consumer waits on each producer in turn.
// A trailing partial row (len % 3 values) is not written.
//
// If the file or the buffer cannot be obtained, the error is reported on
// stderr and the function returns without writing.
void test_d2sci_multithread(double *data, int len)
{
    const char *path = "K:/data/d2sci_write_multithread.txt";
    FILE *f = std::fopen(path, "wb");
    if (f == nullptr)
    {
        std::perror(path);
        return;
    }

    const int thread_num = 6;                        // number of producer threads
    const int every_cycle = 2048 * 3;                // values formatted per producer per full round (2048 rows)
    const int round_size = every_cycle * thread_num; // values consumed by one full round
    const int all_cycle = len / round_size;          // number of full rounds
    const int last_cycle_num = len % round_size;     // values left after the full rounds
    const int last_cycle_row = last_cycle_num / 3;   // whole rows left after the full rounds
    const int last_cycle_every_thread_row = last_cycle_row / thread_num;
    const int last_cycle_every_thread_row_rest = last_cycle_row % thread_num;
    // The tail round exists only if at least one whole row is left.
    const int total_cycle = all_cycle + (last_cycle_row > 0 ? 1 : 0);
    // Bytes reserved per producer slot. A tail chunk never exceeds a full
    // chunk because last_cycle_row < thread_num * (every_cycle / 3).
    const int MAX_SIZE = (4 + (24) * 3) * every_cycle;

    char *buffer = (char *)malloc(2 * MAX_SIZE * thread_num * sizeof(char));
    if (buffer == nullptr)
    {
        std::perror("malloc");
        fclose(f);
        return;
    }
    // The two halves of the double buffer; each half holds one slot per producer.
    char *const halves[2] = {buffer, buffer + MAX_SIZE * thread_num};

    // Per-slot state, indexed [round parity][producer id]; entry [*][id] is
    // only accessed while holding producerThreads_mutex[id].
    std::vector<int> chunk_len[2] = {std::vector<int>(thread_num, 0), std::vector<int>(thread_num, 0)};
    std::vector<int> can_write[2] = {std::vector<int>(thread_num, 1), std::vector<int>(thread_num, 1)};
    std::vector<std::mutex> producerThreads_mutex(thread_num);
    std::vector<std::condition_variable> producerThreads_cv(thread_num);

    // All captures are by reference: every thread is joined before this
    // function returns, so the referenced locals outlive the threads.
    auto producer = [&](int id)
    {
        for (int cycle = 0; cycle < total_cycle; ++cycle)
        {
            int rows;      // rows this producer formats in this round
            int start_pos; // index into `data` of the first value of those rows
            if (cycle < all_cycle)
            {
                rows = every_cycle / 3;
                start_pos = cycle * round_size + id * every_cycle;
            }
            else
            {
                // Tail round: producers before `id` own id * base rows plus one
                // extra row for each of them that is among the first `rest`.
                const int extra_before = id < last_cycle_every_thread_row_rest ? id : last_cycle_every_thread_row_rest;
                const int first_row = id * last_cycle_every_thread_row + extra_before;
                rows = last_cycle_every_thread_row + (id < last_cycle_every_thread_row_rest ? 1 : 0);
                start_pos = all_cycle * round_size + first_row * 3;
            }

            const int parity = cycle % 2;
            const double *src = data + start_pos;
            char *out = halves[parity] + id * MAX_SIZE;

            std::unique_lock lk(producerThreads_mutex[id]);
            // Block until the consumer has handed this slot back.
            producerThreads_cv[id].wait(lk, [&] { return can_write[parity][id] != 0; });

            int length = 0;
            for (int row = 0; row < rows; ++row, src += 3)
            {
                length += d2sci(src[0], out + length);
                out[length++] = ' ';
                length += d2sci(src[1], out + length);
                out[length++] = ' ';
                length += d2sci(src[2], out + length);
                out[length++] = '\n';
            }
            chunk_len[parity][id] = length;
            can_write[parity][id] = 0; // slot is now readable by the consumer
            lk.unlock();
            producerThreads_cv[id].notify_one();
        }
    };
    auto consumer = [&]()
    {
        for (int cycle = 0; cycle < total_cycle; ++cycle)
        {
            const int parity = cycle % 2;
            for (int id = 0; id < thread_num; ++id)
            {
                std::unique_lock lk(producerThreads_mutex[id]);
                // Block until producer `id` has published this round's chunk.
                producerThreads_cv[id].wait(lk, [&] { return can_write[parity][id] == 0; });
                fwrite(halves[parity] + id * MAX_SIZE, sizeof(char), chunk_len[parity][id], f);
                can_write[parity][id] = 1; // hand the slot back to the producer
                lk.unlock();
                producerThreads_cv[id].notify_one();
            }
        }
    };

    // NOTE: pinning each producer to a core (SetThreadAffinityMask /
    // pthread_setaffinity_np) was tried here previously and is left disabled.
    std::vector<std::thread> producerThreads;
    producerThreads.reserve(thread_num);
    for (int id = 0; id < thread_num; ++id)
    {
        producerThreads.emplace_back(producer, id);
    }
    std::thread consumer_thread(consumer);
    for (std::thread &worker : producerThreads)
    {
        worker.join();
    }
    consumer_thread.join();

    free(buffer);
    fclose(f);
}
// Benchmark: writes `data` to ./data/d2sci_32_write.txt using d2sci_32(),
// which is called once per group of 32 consecutive values; the output of up to
// 100 groups is collected in a reusable buffer and flushed with fwrite.
//
// data - values to format
// len  - number of values in `data`. Only whole groups of 32 are processed:
//        d2sci_32() is handed a pointer to 32 values (presumably it reads all
//        of them -- TODO confirm against d2sci.h), so a trailing len % 32
//        values are skipped and the last batch is shortened to the groups that
//        remain rather than reading past `data`.
//
// If the output file cannot be opened the error is reported on stderr and
// nothing is written.
void test_d2sci_32(double *data, int len)
{
    const int group_size = 32;
    const int groups_per_batch = 100;
    // 25 bytes per value: assumes d2sci_32() emits at most 25 bytes per value
    // including separators -- TODO confirm against d2sci.h.
    std::vector<char> vbuffer(25 * group_size * groups_per_batch);

    const char *path = "./data/d2sci_32_write.txt";
    FILE *f = std::fopen(path, "wb");
    if (f == nullptr)
    {
        std::perror(path);
        return;
    }

    const int total_groups = len / group_size;
    for (int first_group = 0; first_group < total_groups; first_group += groups_per_batch)
    {
        const int remaining = total_groups - first_group;
        const int batch_groups = remaining < groups_per_batch ? remaining : groups_per_batch;

        int length = 0;
        for (int j = 0; j < batch_groups; ++j)
        {
            length += d2sci_32(&data[(first_group + j) * group_size], &vbuffer[length]);
        }
        fwrite(&vbuffer[0], sizeof(char), length, f);
    }
    fclose(f);
}
// Runs `funcname(data, len)` once, measures its wall-clock duration with
// std::chrono::high_resolution_clock and prints a one-line summary.
//
// funcname - benchmark to run
// data,len - forwarded unchanged to `funcname`
// alg_name - label printed at the start of the summary line
//
// The summary reports total milliseconds, nanoseconds per value and MB/s. The
// per-value figures are derived from `len`, the number of values actually
// handed to the benchmark. The throughput assumes 23.5 output bytes per value
// (presumably the average formatted width including separator -- TODO confirm).
void bench_func(void (*funcname)(double *, int), double *data, int len, const char *alg_name)
{
    const auto t1 = std::chrono::high_resolution_clock::now();

    funcname(data, len);

    const auto t2 = std::chrono::high_resolution_clock::now();
    const auto duration1 = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();

    const double elapsed_ms = duration1 / 1e6;
    const double ns_per_value = 1.0 * duration1 / len;
    const double megabytes = 23.5 * len / 1024 / 1024;
    const double mb_per_second = megabytes / (duration1 / 1e9);
    printf("%s time: %lf ms,every double : %lf ns , bandwidth %lf MB/s\n", alg_name, elapsed_ms, ns_per_value, mb_per_second);
}

// Generates NUM_F64 random doubles, reports how long the generation took and
// then runs the enabled benchmarks on that data set.
int main()
{
    // Random number generator
    std::random_device rd;     // source of the seed
    std::mt19937_64 gen(rd()); // Mersenne Twister seeded from rd

    auto t1 = std::chrono::high_resolution_clock::now();
    // Generate the random input data.
    std::vector<double> data(NUM_F64);
    for (double &value : data)
    {
        // Random sign bit and 52-bit mantissa (exponent field cleared).
        const uint64_t sign_and_mantissa = gen() & 0x800FFFFFFFFFFFFF;
        // Biased exponent in [923, 1122], i.e. magnitudes from 2^-100 up to
        // just below 2^100, so every value is a finite, normal double.
        const uint64_t biased_exponent = (gen() % 200) + (1023 - 100);
        const uint64_t bits = sign_and_mantissa | (biased_exponent << 52);
        // Copy the bit pattern instead of casting the pointer: reading a
        // uint64_t through a double* violates the aliasing rules.
        std::memcpy(&value, &bits, sizeof(value));
    }
    auto t2 = std::chrono::high_resolution_clock::now();
    auto duration1 = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();

    printf("[====>>] finish generate random data time %lf ms\n", duration1 / 1e6);

    // Enable or disable individual benchmarks here.
    // bench_func(test_fprintf, data.data(), NUM_F64, "fprintf");
    // bench_func(test_format, data.data(), NUM_F64, "format");
    //bench_func(test_sprintf_multithread, data.data(), NUM_F64, "sprintf_multithread");
    //bench_func(test_format_multithread, data.data(), NUM_F64, "format_multithread");
    bench_func(test_d2sci, data.data(), NUM_F64, "d2sci");
    bench_func(test_d2sci_multithread, data.data(), NUM_F64, "d2sci_multithread");
    // bench_func(test_d2sci_32, data.data(), NUM_F64,"d2sci_32");
}